import pandas as pd
from datetime import datetime
# Locations of the summer-2014 incident extracts for both cities.
sf_file = "/home/sajit/coursera/ds_scale/systems_algos/assignments/datasci_course_materials/assignment6/sanfrancisco_incidents_summer_2014.csv"
seattle_file = "/home/sajit/coursera/ds_scale/systems_algos/assignments/datasci_course_materials/assignment6/seattle_incidents_summer_2014.csv"

# Load each CSV into its own DataFrame.
sf = pd.read_csv(sf_file)
seattle = pd.read_csv(seattle_file)

# Normalise the Seattle headers: spaces and slashes both become underscores.
seattle.columns = [
    header.replace(' ', '_').replace('/', '_') for header in seattle.columns
]
# Seattle columns that are not used anywhere in the analysis below.
# errors='ignore' makes the drop tolerant of names that are absent.
unused_seattle_columns = [
    'RMS_CDW_ID',
    'General_Offense_Number',
    'Offense_Code',
    'Offense_Code_Extension',
    'Offense_Type',
    'Summary_Offense_Code',
    'Date_Reported',
    'Occurred_Date_Range_End',
    'Hundred_Block_Location',
    'Zone_Beat',
    'Census_Tract_2000',
    'Longitude',
    'Latitude',
    'Location',
    'Year',
]
seattle.drop(unused_seattle_columns, inplace=True, axis=1, errors='ignore')
seattle.columns.values
# Maps datetime.weekday() values (0 = Monday ... 6 = Sunday) to day names.
day_of_week_dict = dict(enumerate((
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
)))

# Maps month numbers to names; only June, July and August are present.
month_dict = dict(enumerate(('June', 'July', 'August'), 6))
def seattle_mapper(x):
    """Derive time-of-day, weekday, date and month fields from a timestamp.

    Parameters
    ----------
    x : str
        Timestamp in the form ``'%m/%d/%Y %I:%M:%S %p'``
        (e.g. ``'06/28/2014 10:31:00 PM'``).

    Returns
    -------
    pandas.Series
        With the entries:
        ``time_of_day``  - 6-hour slot of the day: 0 for 00:00-05:59,
        1 for 06:00-11:59, 2 for 12:00-17:59, 3 for 18:00-23:59;
        ``day_of_week``  - day name looked up in ``day_of_week_dict``;
        ``date``         - the ``datetime.date`` part of the timestamp;
        ``month_label``  - month name looked up in ``month_dict``.

    Raises
    ------
    ValueError
        If ``x`` does not match the expected timestamp format.
    KeyError
        If the month of ``x`` has no entry in ``month_dict``.
    """
    # Named date_format so the builtin ``format`` is not shadowed.
    date_format = '%m/%d/%Y %I:%M:%S %p'
    my_date = datetime.strptime(x, date_format)
    # The hour is always in 0..23, so integer division by 6 gives slots 0..3.
    hour_slot = my_date.hour // 6
    return pd.Series({
        'time_of_day': hour_slot,
        'day_of_week': day_of_week_dict[my_date.weekday()],
        'date': my_date.date(),
        'month_label': month_dict[my_date.month],
    })
# Run the mapper over every occurrence timestamp; each call returns a Series,
# and the collected result is joined back onto the Seattle frame as new columns.
occurred_start = seattle['Occurred_Date_or_Date_Range_Start']
newcols = occurred_start.apply(seattle_mapper)
seattle = seattle.join(newcols)
seattle.columns.values
def mapper_sf_time(x):
    """Map an ``'HH:MM'`` time string to a 6-hour slot of the day.

    Parameters
    ----------
    x : str
        Time of day whose text before the first ``':'`` is the hour,
        e.g. ``'23:47'``.

    Returns
    -------
    int
        0 for hours 0-5, 1 for hours 6-11, 2 for hours 12-17, and 3 for
        anything else (hours 18-23, plus any out-of-range hour value).

    Raises
    ------
    ValueError
        If the text before the first ``':'`` is not an integer.
    """
    hour = int(x.split(':')[0])
    if 0 <= hour < 6:
        return 0
    if 6 <= hour < 12:
        return 1
    if 12 <= hour < 18:
        return 2
    # Fallback bucket, kept as a catch-all exactly like the original else branch.
    return 3
def sf_map_date(x):
    """Return the month name for a ``'%m/%d/%Y'`` date string.

    Parameters
    ----------
    x : str
        Date in the form ``'MM/DD/YYYY'``, e.g. ``'08/31/2014'``.

    Returns
    -------
    str
        The month name looked up in ``month_dict``.

    Raises
    ------
    ValueError
        If ``x`` does not match the expected date format.
    KeyError
        If the month of ``x`` has no entry in ``month_dict``.
    """
    # Named date_format so the builtin ``format`` is not shadowed.
    date_format = '%m/%d/%Y'
    my_date = datetime.strptime(x, date_format)
    return month_dict[my_date.month]
# Add the derived columns first: 'Time' is still needed here and is only
# dropped afterwards.
sf['month_label'] = sf['Date'].apply(sf_map_date)
sf['time_of_day'] = sf['Time'].apply(mapper_sf_time)

# San Francisco columns that are not used in the analysis below.
# errors='ignore' makes the drop tolerant of names that are absent.
unused_sf_columns = [
    'Descript',
    'Time',
    'Resolution',
    'Address',
    'X',
    'Y',
    'Location',
    'PdId',
]
sf.drop(unused_sf_columns, inplace=True, axis=1, errors='ignore')
sf.columns.values
from bokeh.charts import Bar, output_notebook, show
from bokeh.models import HoverTool
# NOTE(review): bokeh.charts appears to have been split out of Bokeh into the
# separate `bkcharts` package in later releases - confirm the installed Bokeh
# version still provides it.
output_notebook()


def _stacked_count_bar(data, label, category, title, xlabel):
    """Build, display and return a stacked bar chart of incident counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Incident records to plot.
    label : str
        Column whose values become the bars on the x axis.
    category : str
        Column that is counted and used to stack each bar; it also feeds
        the "Offense Type" hover tooltip.
    title : str
        Chart title.
    xlabel : str
        X-axis label.

    Returns
    -------
    The chart object created by ``Bar``, after it has been shown.
    """
    chart = Bar(data, label, values=category, stack=category, title=title,
                agg='count', xlabel=xlabel, ylabel="Count", tools='hover')
    chart.select(dict(type=HoverTool)).tooltips = {"Offense Type": "@" + category}
    show(chart)
    return chart


# Incidents by time of day.
seattle_time_slot_crimes = _stacked_count_bar(
    seattle, 'time_of_day', 'Summarized_Offense_Description',
    "Seattle:Incidents by time of the day", "Time of the day")
sf_time_slot_crimes = _stacked_count_bar(
    sf, 'time_of_day', 'Category',
    "SF:Incidents by time of the day", "Time of the day")

# Incidents by day of the week.
seattle_day_ofweek = _stacked_count_bar(
    seattle, 'day_of_week', 'Summarized_Offense_Description',
    "Seattle:Incidents by day of the week", "Day of Week")
sf_day_ofweek = _stacked_count_bar(
    sf, 'DayOfWeek', 'Category',
    "SF:Incidents by day of the week", "Day of Week")

# Incidents by month.
seattle_month = _stacked_count_bar(
    seattle, 'month_label', 'Summarized_Offense_Description',
    "Seattle:Incidents by month", "Month")
sf_month = _stacked_count_bar(
    sf, 'month_label', 'Category',
    "SF:Incidents by month", "Month")

# Incidents by police district / sector.
seattle_pd = _stacked_count_bar(
    seattle, 'District_Sector', 'Summarized_Offense_Description',
    "Seattle:Incidents by PD", "PD")
# Title now carries the "SF:" prefix, matching every other SF chart.
sf_pd = _stacked_count_bar(
    sf, 'PdDistrict', 'Category',
    "SF:Incidents by PD", "PD")
# Let's see how crimes vary by the time of the day and the day of the week.
# Count Seattle incidents per (offense, weekday, time-of-day slot).
# The former squeeze=True argument was dropped: it has no effect on a column
# count, and groupby() no longer accepts it in pandas 2.0+.
result = seattle.groupby(
    ['Summarized_Offense_Description', 'day_of_week', 'time_of_day']
)['Summarized_Offense_Description'].count()

# Look at the weekday / time-slot breakdown for two specific offenses.
result['CAR PROWL']
result['ASSAULT']